In [1]:
# Datasets available at:
# UCI, Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# Data Folder: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
# 1. White wine: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
# 2. Red Wine: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
In [2]:
# Importing: Pandas, NumPy, Matplotlib, Seaborn and Scikit-Learn libraries.
import pandas as pd
import numpy as ny
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
#
In [3]:
# Importing both the white wine and red wine datasets.
white_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')
red_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
In [4]:
white_wine.head()
Out[4]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.0 0.27 0.36 20.7 0.045 45.0 170.0 1.0010 3.00 0.45 8.8 6
1 6.3 0.30 0.34 1.6 0.049 14.0 132.0 0.9940 3.30 0.49 9.5 6
2 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44 10.1 6
3 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
4 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 6
In [5]:
white_wine.tail()
Out[5]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
4893 6.2 0.21 0.29 1.6 0.039 24.0 92.0 0.99114 3.27 0.50 11.2 6
4894 6.6 0.32 0.36 8.0 0.047 57.0 168.0 0.99490 3.15 0.46 9.6 5
4895 6.5 0.24 0.19 1.2 0.041 30.0 111.0 0.99254 2.99 0.46 9.4 6
4896 5.5 0.29 0.30 1.1 0.022 20.0 110.0 0.98869 3.34 0.38 12.8 7
4897 6.0 0.21 0.38 0.8 0.020 22.0 98.0 0.98941 3.26 0.32 11.8 6
In [6]:
print(white_wine.shape)
white_wine.info()
white_wine.describe()
(4898, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
Out[6]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000
mean 6.854788 0.278241 0.334192 6.391415 0.045772 35.308085 138.360657 0.994027 3.188267 0.489847 10.514267 5.877909
std 0.843868 0.100795 0.121020 5.072058 0.021848 17.007137 42.498065 0.002991 0.151001 0.114126 1.230621 0.885639
min 3.800000 0.080000 0.000000 0.600000 0.009000 2.000000 9.000000 0.987110 2.720000 0.220000 8.000000 3.000000
25% 6.300000 0.210000 0.270000 1.700000 0.036000 23.000000 108.000000 0.991723 3.090000 0.410000 9.500000 5.000000
50% 6.800000 0.260000 0.320000 5.200000 0.043000 34.000000 134.000000 0.993740 3.180000 0.470000 10.400000 6.000000
75% 7.300000 0.320000 0.390000 9.900000 0.050000 46.000000 167.000000 0.996100 3.280000 0.550000 11.400000 6.000000
max 14.200000 1.100000 1.660000 65.800000 0.346000 289.000000 440.000000 1.038980 3.820000 1.080000 14.200000 9.000000
In [7]:
red_wine.head()
Out[7]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
In [8]:
red_wine.tail()
Out[8]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
1594 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5
1595 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6
1596 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6
1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5
1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 6
In [9]:
print(red_wine.shape)
red_wine.info()
red_wine.describe()
(1599, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
Out[9]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000
mean 8.319637 0.527821 0.270976 2.538806 0.087467 15.874922 46.467792 0.996747 3.311113 0.658149 10.422983 5.636023
std 1.741096 0.179060 0.194801 1.409928 0.047065 10.460157 32.895324 0.001887 0.154386 0.169507 1.065668 0.807569
min 4.600000 0.120000 0.000000 0.900000 0.012000 1.000000 6.000000 0.990070 2.740000 0.330000 8.400000 3.000000
25% 7.100000 0.390000 0.090000 1.900000 0.070000 7.000000 22.000000 0.995600 3.210000 0.550000 9.500000 5.000000
50% 7.900000 0.520000 0.260000 2.200000 0.079000 14.000000 38.000000 0.996750 3.310000 0.620000 10.200000 6.000000
75% 9.200000 0.640000 0.420000 2.600000 0.090000 21.000000 62.000000 0.997835 3.400000 0.730000 11.100000 6.000000
max 15.900000 1.580000 1.000000 15.500000 0.611000 72.000000 289.000000 1.003690 4.010000 2.000000 14.900000 8.000000
In [10]:
# There are a total of 4898 samples of the white wine variant and 1599 samples of the red wine variant. 
# Both the red and white wine samples are qualified by a set of eleven physicochemical features. 
# These physicochemical attributes are: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, 
# free sulfur dioxide, total sulfur dioxide, density, pH, sulphates and alcohol content. 
# All these eleven attributes would be continuously distributed and be of the ‘float’ datatype. 
# 
# The twelfth attribute, the quality attribute, can be considered as a sensory information indicator, 
# indicating a quality score between 0 and 10 and would be of the ‘integer’ datatype. 
# This quality attribute is our attribute of interest, being attempted to classify and predict.
In [11]:
wine_data_all = pd.concat([white_wine, red_wine], axis=0)
In [12]:
print(wine_data_all.shape)
wine_data_all.info()
wine_data_all.describe()
(6497, 12)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 659.9 KB
Out[12]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000
mean 7.215307 0.339666 0.318633 5.443235 0.056034 30.525319 115.744574 0.994697 3.218501 0.531268 10.491801 5.818378
std 1.296434 0.164636 0.145318 4.757804 0.035034 17.749400 56.521855 0.002999 0.160787 0.148806 1.192712 0.873255
min 3.800000 0.080000 0.000000 0.600000 0.009000 1.000000 6.000000 0.987110 2.720000 0.220000 8.000000 3.000000
25% 6.400000 0.230000 0.250000 1.800000 0.038000 17.000000 77.000000 0.992340 3.110000 0.430000 9.500000 5.000000
50% 7.000000 0.290000 0.310000 3.000000 0.047000 29.000000 118.000000 0.994890 3.210000 0.510000 10.300000 6.000000
75% 7.700000 0.400000 0.390000 8.100000 0.065000 41.000000 156.000000 0.996990 3.320000 0.600000 11.300000 6.000000
max 15.900000 1.580000 1.660000 65.800000 0.611000 289.000000 440.000000 1.038980 4.010000 2.000000 14.900000 9.000000
In [13]:
print(wine_data_all)
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.0             0.270         0.36            20.7      0.045   
1               6.3             0.300         0.34             1.6      0.049   
2               8.1             0.280         0.40             6.9      0.050   
3               7.2             0.230         0.32             8.5      0.058   
4               7.2             0.230         0.32             8.5      0.058   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                    45.0                 170.0  1.00100  3.00       0.45   
1                    14.0                 132.0  0.99400  3.30       0.49   
2                    30.0                  97.0  0.99510  3.26       0.44   
3                    47.0                 186.0  0.99560  3.19       0.40   
4                    47.0                 186.0  0.99560  3.19       0.40   
...                   ...                   ...      ...   ...        ...   
1594                 32.0                  44.0  0.99490  3.45       0.58   
1595                 39.0                  51.0  0.99512  3.52       0.76   
1596                 29.0                  40.0  0.99574  3.42       0.75   
1597                 32.0                  44.0  0.99547  3.57       0.71   
1598                 18.0                  42.0  0.99549  3.39       0.66   

      alcohol  quality  
0         8.8        6  
1         9.5        6  
2        10.1        6  
3         9.9        6  
4         9.9        6  
...       ...      ...  
1594     10.5        5  
1595     11.2        6  
1596     11.0        6  
1597     10.2        5  
1598     11.0        6  

[6497 rows x 12 columns]
In [14]:
# This wine_data_all is the consolidation of the white wine (4898 samples) and red wine (1599 samples) samples, 
# and this consolidated dataset of 4898 + 1599 = 6497 entries would be the sample set, used for our analysis.
In [15]:
# Check for missing values.
wine_data_all.isnull().sum()
Out[15]:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
In [16]:
# This consolidated dataset would have no missing values. 
In [17]:
# Distribution of the eleven physicochemical features, namely: fixed acidity, volatile acidity, citric acid, residual sugar, 
# chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates and alcohol content. 
plt.hist(wine_data_all['fixed acidity'])
plt.xlabel("Fixed Acidity")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['volatile acidity'])
plt.xlabel("Volatile Acidity")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['citric acid'])
plt.xlabel("Citric Acid")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['residual sugar'])
plt.xlabel("Residual Sugar")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['chlorides'])
plt.xlabel("Chlorides")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['free sulfur dioxide'])
plt.xlabel("Free Sulfur Dioxide")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['total sulfur dioxide'])
plt.xlabel("Total Sulfur Dioxide")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['density'])
plt.xlabel("Density")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['pH'])
plt.xlabel("pH")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['sulphates'])
plt.xlabel("Sulphates")
plt.ylabel("Frequency distribution")
plt.show()
plt.hist(wine_data_all['alcohol'])
plt.xlabel("Alcohol Content")
plt.ylabel("Frequency distribution")
plt.show()
In [18]:
# Distribution of all the attributes in the consolidated dataset.
wine_data_all.hist(bins=25, figsize=(20, 20))
plt.show()
In [19]:
# We can see that all the attributes are following different distributions, also have different ranges and scale.
# We can also see that there would be no outliers, which would need actioning.
In [20]:
wine_data_all.describe()
Out[20]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000
mean 7.215307 0.339666 0.318633 5.443235 0.056034 30.525319 115.744574 0.994697 3.218501 0.531268 10.491801 5.818378
std 1.296434 0.164636 0.145318 4.757804 0.035034 17.749400 56.521855 0.002999 0.160787 0.148806 1.192712 0.873255
min 3.800000 0.080000 0.000000 0.600000 0.009000 1.000000 6.000000 0.987110 2.720000 0.220000 8.000000 3.000000
25% 6.400000 0.230000 0.250000 1.800000 0.038000 17.000000 77.000000 0.992340 3.110000 0.430000 9.500000 5.000000
50% 7.000000 0.290000 0.310000 3.000000 0.047000 29.000000 118.000000 0.994890 3.210000 0.510000 10.300000 6.000000
75% 7.700000 0.400000 0.390000 8.100000 0.065000 41.000000 156.000000 0.996990 3.320000 0.600000 11.300000 6.000000
max 15.900000 1.580000 1.660000 65.800000 0.611000 289.000000 440.000000 1.038980 4.010000 2.000000 14.900000 9.000000
In [21]:
# Distribution and details of our attribute of interest, the quality attribute.
plt.hist(wine_data_all.quality)
plt.xlabel("Quality")
plt.ylabel("Frequency distribution of wine quality")
plt.show()
wine_data_all.quality.describe()
print(wine_data_all['quality'].value_counts())
6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64
In [22]:
# It’s interesting to note that 6053 (2836 + 2138 + 1079) entries out of the total 6497 entries, 
# about 93%, would have a quality rating of either 5 / 6 or 7. 
In [23]:
wine_data_all.quality.describe()
Out[23]:
count    6497.000000
mean        5.818378
std         0.873255
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64
In [24]:
# Check for correlations between the attributes
sns.heatmap(wine_data_all.corr(), cmap='coolwarm')
Out[24]:
<AxesSubplot:>
In [25]:
wine_data_all.corr()
Out[25]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
fixed acidity 1.000000 0.219008 0.324436 -0.111981 0.298195 -0.282735 -0.329054 0.458910 -0.252700 0.299568 -0.095452 -0.076743
volatile acidity 0.219008 1.000000 -0.377981 -0.196011 0.377124 -0.352557 -0.414476 0.271296 0.261454 0.225984 -0.037640 -0.265699
citric acid 0.324436 -0.377981 1.000000 0.142451 0.038998 0.133126 0.195242 0.096154 -0.329808 0.056197 -0.010493 0.085532
residual sugar -0.111981 -0.196011 0.142451 1.000000 -0.128940 0.402871 0.495482 0.552517 -0.267320 -0.185927 -0.359415 -0.036980
chlorides 0.298195 0.377124 0.038998 -0.128940 1.000000 -0.195045 -0.279630 0.362615 0.044708 0.395593 -0.256916 -0.200666
free sulfur dioxide -0.282735 -0.352557 0.133126 0.402871 -0.195045 1.000000 0.720934 0.025717 -0.145854 -0.188457 -0.179838 0.055463
total sulfur dioxide -0.329054 -0.414476 0.195242 0.495482 -0.279630 0.720934 1.000000 0.032395 -0.238413 -0.275727 -0.265740 -0.041385
density 0.458910 0.271296 0.096154 0.552517 0.362615 0.025717 0.032395 1.000000 0.011686 0.259478 -0.686745 -0.305858
pH -0.252700 0.261454 -0.329808 -0.267320 0.044708 -0.145854 -0.238413 0.011686 1.000000 0.192123 0.121248 0.019506
sulphates 0.299568 0.225984 0.056197 -0.185927 0.395593 -0.188457 -0.275727 0.259478 0.192123 1.000000 -0.003029 0.038485
alcohol -0.095452 -0.037640 -0.010493 -0.359415 -0.256916 -0.179838 -0.265740 -0.686745 0.121248 -0.003029 1.000000 0.444319
quality -0.076743 -0.265699 0.085532 -0.036980 -0.200666 0.055463 -0.041385 -0.305858 0.019506 0.038485 0.444319 1.000000
In [26]:
# With respect to the quality attribute, strongest correlations seen between:
# quality / alcohol (0.444319: positively correlated)
# quality / density (-0.305858: negatively correlated)
# quality / volatile acidity (-0.265699: negatively correlated) 
# quality / chlorides (-0.200666: negatively correlated) pairs.
#
# As regards the other set of attributes, strongest correlations seen between:
# free sulfur dioxide / total sulfur dioxide (0.720934: positively correlated)
# alcohol / density (-0.686745: negatively correlated)
# density / residual sugar (0.552517: positively correlated)
# residual sugar / total sulfur dioxide (0.495482: positively correlated) 
# density / fixed acidity (0.458910: positively correlated) pairs.
In [27]:
# Creating four distinct quality labels: low, medium, high and excellent
# These four levels would be:
# Level 1: Low (for quality attribute values: 0, 1 and 2)
# Level 2: Medium (for quality attribute values: 3, 4 and 5)
# Level 3: High (for quality attribute values: 6, 7 and 8)
# Level 4: Excellent (for quality attribute values: 9 and 10)
wine_data_all_4levels = wine_data_all.copy()
bins = [0, 2.5, 5.5, 8.5, 10] 
labels = [1, 2, 3, 4]
wine_data_all_4levels['quality'] = pd.cut(wine_data_all_4levels['quality'], bins=bins, labels=labels)
In [28]:
print(wine_data_all_4levels['quality'].value_counts())
wine_data_all_4levels.quality.describe()
3    4108
2    2384
4       5
1       0
Name: quality, dtype: int64
Out[28]:
count     6497
unique       3
top          3
freq      4108
Name: quality, dtype: int64
In [30]:
wine_data_all_n = wine_data_all_4levels.copy()
wine_data_all_n.info()
wine_data_all_n['quality']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   fixed acidity         6497 non-null   float64 
 1   volatile acidity      6497 non-null   float64 
 2   citric acid           6497 non-null   float64 
 3   residual sugar        6497 non-null   float64 
 4   chlorides             6497 non-null   float64 
 5   free sulfur dioxide   6497 non-null   float64 
 6   total sulfur dioxide  6497 non-null   float64 
 7   density               6497 non-null   float64 
 8   pH                    6497 non-null   float64 
 9   sulphates             6497 non-null   float64 
 10  alcohol               6497 non-null   float64 
 11  quality               6497 non-null   category
dtypes: category(1), float64(11)
memory usage: 615.6 KB
Out[30]:
0       3
1       3
2       3
3       3
4       3
       ..
1594    2
1595    3
1596    3
1597    2
1598    3
Name: quality, Length: 6497, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]
In [31]:
wine_data_all_n.describe()
Out[31]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
count 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000
mean 7.215307 0.339666 0.318633 5.443235 0.056034 30.525319 115.744574 0.994697 3.218501 0.531268 10.491801
std 1.296434 0.164636 0.145318 4.757804 0.035034 17.749400 56.521855 0.002999 0.160787 0.148806 1.192712
min 3.800000 0.080000 0.000000 0.600000 0.009000 1.000000 6.000000 0.987110 2.720000 0.220000 8.000000
25% 6.400000 0.230000 0.250000 1.800000 0.038000 17.000000 77.000000 0.992340 3.110000 0.430000 9.500000
50% 7.000000 0.290000 0.310000 3.000000 0.047000 29.000000 118.000000 0.994890 3.210000 0.510000 10.300000
75% 7.700000 0.400000 0.390000 8.100000 0.065000 41.000000 156.000000 0.996990 3.320000 0.600000 11.300000
max 15.900000 1.580000 1.660000 65.800000 0.611000 289.000000 440.000000 1.038980 4.010000 2.000000 14.900000
In [32]:
# Feature scaling, using Min-Max normalization technique (Normalizing numeric attributes)
#
def normalize(x):
  return ((x - min(x)) / (max(x) - min(x))) 
#
#
X = list(set(list(wine_data_all_4levels)) - set(['quality']))
#
wine_data_all_n = wine_data_all_4levels.iloc[:,].copy()
wine_data_all_n[X] = wine_data_all_n[X].apply(normalize)
#
print(wine_data_all_n.shape)
wine_data_all_n.info()
wine_data_all_n.describe()
#
(6497, 12)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   fixed acidity         6497 non-null   float64 
 1   volatile acidity      6497 non-null   float64 
 2   citric acid           6497 non-null   float64 
 3   residual sugar        6497 non-null   float64 
 4   chlorides             6497 non-null   float64 
 5   free sulfur dioxide   6497 non-null   float64 
 6   total sulfur dioxide  6497 non-null   float64 
 7   density               6497 non-null   float64 
 8   pH                    6497 non-null   float64 
 9   sulphates             6497 non-null   float64 
 10  alcohol               6497 non-null   float64 
 11  quality               6497 non-null   category
dtypes: category(1), float64(11)
memory usage: 615.6 KB
Out[32]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
count 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000
mean 0.282257 0.173111 0.191948 0.074283 0.078129 0.102518 0.252868 0.146262 0.386435 0.174870 0.361131
std 0.107143 0.109758 0.087541 0.072972 0.058195 0.061630 0.130235 0.057811 0.124641 0.083599 0.172857
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.214876 0.100000 0.150602 0.018405 0.048173 0.055556 0.163594 0.100829 0.302326 0.117978 0.217391
50% 0.264463 0.140000 0.186747 0.036810 0.063123 0.097222 0.258065 0.149990 0.379845 0.162921 0.333333
75% 0.322314 0.213333 0.234940 0.115031 0.093023 0.138889 0.345622 0.190476 0.465116 0.213483 0.478261
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [33]:
wine_data_all_n
Out[33]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 0.264463 0.126667 0.216867 0.308282 0.059801 0.152778 0.377880 0.267785 0.217054 0.129213 0.115942 3
1 0.206612 0.146667 0.204819 0.015337 0.066445 0.045139 0.290323 0.132832 0.449612 0.151685 0.217391 3
2 0.355372 0.133333 0.240964 0.096626 0.068106 0.100694 0.209677 0.154039 0.418605 0.123596 0.304348 3
3 0.280992 0.100000 0.192771 0.121166 0.081395 0.159722 0.414747 0.163678 0.364341 0.101124 0.275362 3
4 0.280992 0.100000 0.192771 0.121166 0.081395 0.159722 0.414747 0.163678 0.364341 0.101124 0.275362 3
... ... ... ... ... ... ... ... ... ... ... ... ...
1594 0.198347 0.346667 0.048193 0.021472 0.134551 0.107639 0.087558 0.150183 0.565891 0.202247 0.362319 2
1595 0.173554 0.313333 0.060241 0.024540 0.088040 0.131944 0.103687 0.154425 0.620155 0.303371 0.463768 3
1596 0.206612 0.286667 0.078313 0.026074 0.111296 0.097222 0.078341 0.166377 0.542636 0.297753 0.434783 3
1597 0.173554 0.376667 0.072289 0.021472 0.109635 0.107639 0.087558 0.161172 0.658915 0.275281 0.318841 2
1598 0.181818 0.153333 0.283133 0.046012 0.096346 0.059028 0.082949 0.161558 0.519380 0.247191 0.434783 3

6497 rows × 12 columns

In [34]:
# Removing the quality attribute from the dataframe
wine_data_all_n_fs = wine_data_all_n.copy()
wine_data_all_n_fs.info()
X = wine_data_all_n_fs[wine_data_all_n_fs.columns[0:-1]]
y = wine_data_all_n_fs[wine_data_all_n_fs.columns[-1]]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   fixed acidity         6497 non-null   float64 
 1   volatile acidity      6497 non-null   float64 
 2   citric acid           6497 non-null   float64 
 3   residual sugar        6497 non-null   float64 
 4   chlorides             6497 non-null   float64 
 5   free sulfur dioxide   6497 non-null   float64 
 6   total sulfur dioxide  6497 non-null   float64 
 7   density               6497 non-null   float64 
 8   pH                    6497 non-null   float64 
 9   sulphates             6497 non-null   float64 
 10  alcohol               6497 non-null   float64 
 11  quality               6497 non-null   category
dtypes: category(1), float64(11)
memory usage: 615.6 KB
In [35]:
y.describe()
y
Out[35]:
0       3
1       3
2       3
3       3
4       3
       ..
1594    2
1595    3
1596    3
1597    2
1598    3
Name: quality, Length: 6497, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]
In [36]:
from sklearn.feature_selection import SelectKBest, f_classif
subset_features = SelectKBest(score_func=f_classif, k='all')
subset_features.fit(X, y)
X_train_ = subset_features.transform(X)
X_test_ = subset_features.transform(X)
#
In [37]:
# Analysing the physicochemical features that would be impacting the quality attribute the most and 
# perfoming feature selection by using the 'analysis of variance', ANOVA method.
#
# Removing the quality attribute from the dataframe
wine_data_all_n_fs = wine_data_all_n.copy()
# wine_data_all_n_fs.info()
X = wine_data_all_n_fs[wine_data_all_n_fs.columns[0:-1]]
y = wine_data_all_n_fs[wine_data_all_n_fs.columns[-1]]
X.info()
# y.info()
#
import plotly.express as px
from sklearn.feature_selection import SelectKBest, f_classif
subset_features = SelectKBest(score_func=f_classif, k='all')
subset_features.fit(X, y)
X_train_ = subset_features.transform(X)
X_test_ = subset_features.transform(X)
subset_features_names =  pd.DataFrame()
subset_features_names['Features'] = subset_features.feature_names_in_
subset_features_names['Score'] =  subset_features.scores_
subset_features_names.sort_values(by = 'Score' , ascending = False , inplace = True)
fig = px.histogram(subset_features_names, x='Features', y='Score', text_auto=True, color='Features', title='Scores:', template='simple_white')
fig.show()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
dtypes: float64(11)
memory usage: 609.1 KB
In [38]:
# Based on the above scores, we see that the 'alcohol', 'density', 'volatile acidity' and 'chlorides' attributes would be
# impacting quality the most. These results are also consistent with the results of the correlation matrix. 
# The other interesting thing, is that the scores of the other seven physicochemical features, 
# would be very low as compared to these four attributes, and we can omit the lower score features and 
# use a subset of these four high score features for our predictive model building tasks. 
#
# Dropping the lower score features from the dataset and only keeping the four high score features namely: 
# 'alcohol', 'density', 'volatile acidity' and 'chlorides' for our analysis. 
wine_data_all_n_selected_subset_of_attributes = wine_data_all_n.drop(['fixed acidity', 'citric acid', 'residual sugar', 
                                                                     'free sulfur dioxide', 'total sulfur dioxide', 
                                                                     'pH', 'sulphates'], axis = 1)

wine_data_all_n_selected_subset_of_attributes.info()
wine_data_all_n_selected_subset_of_attributes.describe()
#
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   volatile acidity  6497 non-null   float64 
 1   chlorides         6497 non-null   float64 
 2   density           6497 non-null   float64 
 3   alcohol           6497 non-null   float64 
 4   quality           6497 non-null   category
dtypes: category(1), float64(4)
memory usage: 260.3 KB
Out[38]:
volatile acidity chlorides density alcohol
count 6497.000000 6497.000000 6497.000000 6497.000000
mean 0.173111 0.078129 0.146262 0.361131
std 0.109758 0.058195 0.057811 0.172857
min 0.000000 0.000000 0.000000 0.000000
25% 0.100000 0.048173 0.100829 0.217391
50% 0.140000 0.063123 0.149990 0.333333
75% 0.213333 0.093023 0.190476 0.478261
max 1.000000 1.000000 1.000000 1.000000
In [39]:
#
wine_data_all_n_selected_subset_of_attributes.quality
#
Out[39]:
0       3
1       3
2       3
3       3
4       3
       ..
1594    2
1595    3
1596    3
1597    2
1598    3
Name: quality, Length: 6497, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]
In [40]:
wine_data_all_n_selected_subset_of_attributes.info()
wine_data_all_n_selected_subset_of_attributes.describe()
wine_data_all_n.info()
wine_data_all_n.describe()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   volatile acidity  6497 non-null   float64 
 1   chlorides         6497 non-null   float64 
 2   density           6497 non-null   float64 
 3   alcohol           6497 non-null   float64 
 4   quality           6497 non-null   category
dtypes: category(1), float64(4)
memory usage: 260.3 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   fixed acidity         6497 non-null   float64 
 1   volatile acidity      6497 non-null   float64 
 2   citric acid           6497 non-null   float64 
 3   residual sugar        6497 non-null   float64 
 4   chlorides             6497 non-null   float64 
 5   free sulfur dioxide   6497 non-null   float64 
 6   total sulfur dioxide  6497 non-null   float64 
 7   density               6497 non-null   float64 
 8   pH                    6497 non-null   float64 
 9   sulphates             6497 non-null   float64 
 10  alcohol               6497 non-null   float64 
 11  quality               6497 non-null   category
dtypes: category(1), float64(11)
memory usage: 615.6 KB
Out[40]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
count 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000 6497.000000
mean 0.282257 0.173111 0.191948 0.074283 0.078129 0.102518 0.252868 0.146262 0.386435 0.174870 0.361131
std 0.107143 0.109758 0.087541 0.072972 0.058195 0.061630 0.130235 0.057811 0.124641 0.083599 0.172857
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.214876 0.100000 0.150602 0.018405 0.048173 0.055556 0.163594 0.100829 0.302326 0.117978 0.217391
50% 0.264463 0.140000 0.186747 0.036810 0.063123 0.097222 0.258065 0.149990 0.379845 0.162921 0.333333
75% 0.322314 0.213333 0.234940 0.115031 0.093023 0.138889 0.345622 0.190476 0.465116 0.213483 0.478261
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [41]:
# Removing the quality attribute from the dataframe
# For dataset having all features
x_all = wine_data_all_n[wine_data_all_n.columns[0:-1]]
y_all = wine_data_all_n[wine_data_all_n.columns[-1]]
# x_all.info()
# x_all.describe()
# x_all
# y_all.info()
# y_all.describe()
# y_all
# For dataset having only the four selected features
x_all_sf = wine_data_all_n_selected_subset_of_attributes[wine_data_all_n_selected_subset_of_attributes.columns[0:-1]]
y_all_sf = wine_data_all_n_selected_subset_of_attributes[wine_data_all_n_selected_subset_of_attributes.columns[-1]]
# x_all_sf.info()
# x_all_sf.describe()
# x_all_sf
# y_all_sf.info()
# y_all_sf.describe()
# y_all_sf
In [42]:
# Dividing the dataset into training and testing sets
from sklearn.model_selection import train_test_split
# 80:20 ; train:test split
# For dataset having all features
x_all_train, x_all_test, y_all_train, y_all_test = train_test_split(x_all, y_all, test_size=.2, random_state=41)
# For dataset having only the four selected features
x_all_sf_train, x_all_sf_test, y_all_sf_train, y_all_sf_test = train_test_split(x_all_sf, y_all_sf, test_size=.2, random_state=41)
In [43]:
for data in [y_all_train, y_all_test]:
    print(data.describe())
for data in [y_all_sf_train, y_all_sf_test]:
    print(data.describe())
#
import warnings
warnings.filterwarnings("ignore")
#
count     5197
unique       3
top          3
freq      3291
Name: quality, dtype: int64
count     1300
unique       3
top          3
freq       817
Name: quality, dtype: int64
count     5197
unique       3
top          3
freq      3291
Name: quality, dtype: int64
count     1300
unique       3
top          3
freq       817
Name: quality, dtype: int64
In [44]:
# Decision Tree implementation: using all features
# 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
# 
# Finding the best combination of parameters to use:
model_dt_all = DecisionTreeClassifier(random_state = 111)
params_ = {"max_depth": range(1,11), "max_features": range(1,21), "criterion": ["gini", "entropy"]}
dt_all_p = GridSearchCV(model_dt_all, params_, cv=4)
dt_all_p.fit(x_all_train,y_all_train)
print(dt_all_p.best_params_)
{'criterion': 'gini', 'max_depth': 9, 'max_features': 8}
In [45]:
# Decision Tree implementation: using all features
# Using the above combination of parameters in our implementation
dt_all = DecisionTreeClassifier(criterion = 'gini', max_depth = 9, max_features = 8, random_state = 111)
dt_all.fit(x_all_train, y_all_train)
pred_dt_all = dt_all.predict(x_all_test)
print(classification_report(y_all_test, pred_dt_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_dt_all))
cross_val_dt_all = cross_val_score(estimator=dt_all, X= x_all_train, y= y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_dt_all.mean())
#
              precision    recall  f1-score   support

           2       0.70      0.65      0.68       482
           3       0.80      0.84      0.82       817
           4       0.00      0.00      0.00         1

    accuracy                           0.77      1300
   macro avg       0.50      0.50      0.50      1300
weighted avg       0.77      0.77      0.77      1300

Accuracy: 0.7684615384615384
Cross Validation Score:  0.7504321371469178
In [46]:
# Decision Tree implementation: using the subset of four selected features
#
# Finding the best combination of parameters to use:
model_dt_sf = DecisionTreeClassifier(random_state = 111)
params_ = {"max_depth": range(1,11), "max_features": range(1,21), "criterion": ["gini", "entropy"]}
dt_sf_p = GridSearchCV(model_dt_sf, params_, cv=4)
dt_sf_p.fit(x_all_sf_train,y_all_sf_train)
print(dt_sf_p.best_params_)
#
{'criterion': 'gini', 'max_depth': 6, 'max_features': 2}
In [47]:
# Decision Tree implementation: using the subset of four selected features
# Using the above combination of parameters in our implementation
dt_sf = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, max_features = 2, random_state = 111)
dt_sf.fit(x_all_sf_train, y_all_sf_train)
pred_dt_sf = dt_sf.predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_dt_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_dt_sf))
cross_val_dt_sf = cross_val_score(estimator=dt_sf, X= x_all_sf_train, y= y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_dt_sf.mean())
#
              precision    recall  f1-score   support

           2       0.70      0.59      0.64       482
           3       0.78      0.85      0.81       817
           4       0.00      0.00      0.00         1

    accuracy                           0.75      1300
   macro avg       0.49      0.48      0.48      1300
weighted avg       0.75      0.75      0.75      1300

Accuracy: 0.7538461538461538
Cross Validation Score:  0.7383126073310831
In [48]:
# Random Forest implementation: using all features
# 
from sklearn.ensemble import RandomForestClassifier
# 
rf_all = RandomForestClassifier(random_state = 111)
rf_all.fit(x_all_train,y_all_train)
pred_rf_all = rf_all.predict(x_all_test)
print(classification_report(y_all_test, pred_rf_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_rf_all))
cross_val_rf_all = cross_val_score(estimator=rf_all, X= x_all_train, y= y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_rf_all.mean())
#
              precision    recall  f1-score   support

           2       0.80      0.72      0.76       482
           3       0.84      0.89      0.87       817
           4       0.00      0.00      0.00         1

    accuracy                           0.83      1300
   macro avg       0.55      0.54      0.54      1300
weighted avg       0.83      0.83      0.83      1300

Accuracy: 0.8284615384615385
Cross Validation Score:  0.8195111624326405
In [49]:
# Random Forest implementation: using the subset of four selected features
#
rf_sf = RandomForestClassifier(random_state = 111)
rf_sf.fit(x_all_sf_train,y_all_sf_train)
pred_rf_sf = rf_sf.predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_rf_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_rf_sf))
cross_val_rf_sf = cross_val_score(estimator=rf_sf, X= x_all_sf_train, y= y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_rf_sf.mean())
#
              precision    recall  f1-score   support

           2       0.78      0.69      0.73       482
           3       0.83      0.88      0.86       817
           4       0.00      0.00      0.00         1

    accuracy                           0.81      1300
   macro avg       0.54      0.53      0.53      1300
weighted avg       0.81      0.81      0.81      1300

Accuracy: 0.813076923076923
Cross Validation Score:  0.7810274175401196
In [50]:
# k-Nearest Neighbour (k-NN) implementation: using all features
# 
from sklearn.neighbors import KNeighborsClassifier
# 
# Finding the best combination of parameters to use:
model_kNN_all = KNeighborsClassifier()
# print(model_kNN_all.get_params())
params_ = [{'n_neighbors': [2,3,4,5,6], 'weights': ['uniform','distance'], 
                  'metric': ('minkowski', 'chebyshev', 'euclidean')}]
kNN_all_p = GridSearchCV(model_kNN_all, params_, cv=4)
kNN_all_p.fit(x_all_train,y_all_train)
print(kNN_all_p.best_params_)
{'metric': 'minkowski', 'n_neighbors': 6, 'weights': 'distance'}
In [51]:
# k-Nearest Neighbour (k-NN) implementation: using all features
# Using the above combination of parameters in our implementation
kNN_all = KNeighborsClassifier(metric = 'minkowski', n_neighbors = 6, weights = 'distance')
kNN_all.fit(x_all_train, y_all_train)
pred_kNN_all = kNN_all.predict(x_all_test)
print(classification_report(y_all_test, pred_kNN_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_kNN_all))
cross_val_kNN_all = cross_val_score(estimator=kNN_all, X= x_all_train, y= y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_kNN_all.mean())
#
              precision    recall  f1-score   support

           2       0.74      0.68      0.71       482
           3       0.82      0.86      0.84       817
           4       0.00      0.00      0.00         1

    accuracy                           0.79      1300
   macro avg       0.52      0.51      0.52      1300
weighted avg       0.79      0.79      0.79      1300

Accuracy: 0.7923076923076923
Cross Validation Score:  0.7966117427607036
In [52]:
# k-Nearest Neighbour (k-NN) implementation: using the subset of four selected features
#
# Finding the best combination of parameters to use:
model_kNN_sf = KNeighborsClassifier()
params_ = [{'n_neighbors': [2,3,4,5,6], 'weights': ['uniform','distance'], 
                  'metric': ('minkowski', 'chebyshev', 'euclidean')}]
kNN_sf_p = GridSearchCV(model_kNN_sf, params_, cv=4)
kNN_sf_p.fit(x_all_sf_train,y_all_sf_train)
print(kNN_sf_p.best_params_)
{'metric': 'chebyshev', 'n_neighbors': 6, 'weights': 'distance'}
In [53]:
# k-Nearest Neighbour (k-NN) implementation: using the subset of four selected features
# Using the above combination of parameters in our implementation
kNN_sf = KNeighborsClassifier(metric = 'chebyshev', n_neighbors = 6, weights = 'distance')
kNN_sf.fit(x_all_sf_train, y_all_sf_train)
pred_kNN_sf = kNN_sf.predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_kNN_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_kNN_sf))
cross_val_kNN_sf = cross_val_score(estimator=kNN_sf, X= x_all_sf_train, y= y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_kNN_sf.mean())
#
              precision    recall  f1-score   support

           2       0.76      0.67      0.72       482
           3       0.82      0.88      0.85       817
           4       0.00      0.00      0.00         1

    accuracy                           0.80      1300
   macro avg       0.53      0.52      0.52      1300
weighted avg       0.80      0.80      0.80      1300

Accuracy: 0.8007692307692308
Cross Validation Score:  0.7683263753182921
In [54]:
# Support Vector Machine (SVM) implementation: using all features
# 
from sklearn.svm import SVC
# 
svm_all = SVC(random_state = 111)
# print(svm_all.get_params())
svm_all.fit(x_all_train, y_all_train)
pred_svm_all = svm_all.predict(x_all_test)
print(classification_report(y_all_test, pred_svm_all, zero_division=0))
print("Accuracy:", accuracy_score(y_all_test, pred_svm_all))
cross_val_svm_all = cross_val_score(estimator=svm_all, X= x_all_train, y= y_all_train, cv=4)
print("Cross Validation Score: ", cross_val_svm_all.mean())
#
              precision    recall  f1-score   support

           2       0.74      0.60      0.66       482
           3       0.78      0.87      0.83       817
           4       0.00      0.00      0.00         1

    accuracy                           0.77      1300
   macro avg       0.51      0.49      0.50      1300
weighted avg       0.77      0.77      0.76      1300

Accuracy: 0.77
Cross Validation Score:  0.7562077337596969
In [55]:
# Support Vector Machine (SVM) implementation: using the subset of four selected features
#
svm_sf = SVC(random_state = 111)
svm_sf.fit(x_all_sf_train,y_all_sf_train)
pred_svm_sf = svm_sf.predict(x_all_sf_test)
print(classification_report(y_all_sf_test, pred_svm_sf, zero_division=0))
print("Accuracy:", accuracy_score(y_all_sf_test, pred_svm_sf))
cross_val_svm_sf = cross_val_score(estimator=svm_sf, X= x_all_sf_train, y= y_all_sf_train, cv=4)
print("Cross Validation Score: ", cross_val_svm_sf.mean())
#
              precision    recall  f1-score   support

           2       0.72      0.57      0.63       482
           3       0.77      0.87      0.82       817
           4       0.00      0.00      0.00         1

    accuracy                           0.76      1300
   macro avg       0.50      0.48      0.48      1300
weighted avg       0.75      0.76      0.75      1300

Accuracy: 0.7569230769230769
Cross Validation Score:  0.739081986143187
In [56]:
# We have implemented the four algorithms, Decision Tree, Random Forest, k-Nearest Neighbour (k-NN) and 
# Support Vector Machine (SVM), first by using all the attributes in the dataset and then by using 
# the subset of the four most-relevant selected features ('alcohol', 'density', 'volatile acidity' and 'chlorides').
#
# After reviewing the model performance of all the implemented iterations, we see that the performance indicators, 
# when using the full set of features and when using the subset of the four selected features, 
# is almost the same, for all the four algorithms. 
#
# This would be consistent with our ANOVA (analysis of variance) findings, 
# where we see that the scores of the other seven physicochemical features is very low as compared to these four features,
# hence a much lower impact on the ‘quality’ attribute. 
#
# We also see that all of the four algorithms provide robust performance. 
# The accuracy scores would be between the ranges, 75.384% and 82.846%.
# The lowest, 75.384% for the Decision Tree implementation, using the subset of four selected features and 
# the highest, 82.846% for the Random Forest implementation, using all features in the sample set. 
#